

# Notes ------------------------------------------------------------------------
# Goal
#   Calculate heating degree days (HDD) and cooling degree days (CDD)
#   by day and county (for the continental US).
# Data
#   (1) 2010 Census shapefile of county outlines
#   (2) Average daily temperature data NCDFs from NOAA's NARR


# Setup ------------------------------------------------------------------------
  # Options
  options(stringsAsFactors = F)
  # Packages
  library(data.table)
  library(magrittr)
  library(rgeos)
  library(maps)
  library(rgdal)
  library(raster)
  library(parallel)
  library(lubridate)
  library(stringr)
  library(maptools)
  library(ggplot2)
  library(ggthemes)
  library(broom)
  # Directories
  dir_project <- "/Users/edwardarubin/Dropbox/Research/MyProjects/NaturalGas/"
  dir_csv     <- paste0(dir_project, "DataCsv/")
  dir_output  <- paste0(dir_project, "Output/")
  dir_r       <- paste0(dir_project, "DataR/")
  dir_spatial <- paste0(dir_project, "DataSpatial/")
  dir_temp    <- paste0(dir_spatial, "NARRTempDaily/")


# Load data --------------------------------------------------------------------
  # Load the first day of the 1988 daily temperature data from NARR
  temp_sample <- raster(paste0(dir_temp, "air.sfc.1988.nc"))
  # Load US county shapefile
  us_shp <- readOGR(dsn = paste0(dir_spatial, "gz_2010_us_050_00_500k"),
    layer = "gz_2010_us_050_00_500k")
  # Use the state.fips dataset from the maps package to find continental
  # states' FIPS codes
  state_fips <- data.table(state.fips)[, .(fips, abb)] %>% unique()
  state_fips[, fips := str_pad(fips, 2, "left", 0)]
  # Drop Alaska, Hawaii, and Puerto Rico
  us_shp <- subset(us_shp, STATE %in% state_fips$fips)


# Project objects to match CRS -------------------------------------------------
  #NOTE: I commented out the plots after checking them
  # Re-project the shapefile to match the NARR data's CRS
  us_shp <- spTransform(us_shp, temp_sample@crs)
  # Check that everything matches
  #plot(crop(temp_sample[[1]], extent(us_shp)),
  #  col = colorRamps::matlab.like2(100))
  #plot(us_shp, add = T)


# Crop objects -----------------------------------------------------------------
  # Find extent of buffered shapefile
  us_buff <- us_shp %>% gUnaryUnion() %>% gBuffer(., width = 1e4)
  # Crop temperature raster to extent of buffered US shapefile
  temp_sample <- crop(temp_sample, extent(us_buff))


# Find cell weights for each county --------------------------------------------
  # Copy the sample raster
  sample_raster <- copy(temp_sample)
  # Change cell values to number from 1 to the number of cells
  sample_raster %<>% setValues(., 1:ncell(.))
  # Find cell weights needed to calculate county-level summary statistics
  cell_weights <- extract(x = sample_raster, y = us_shp,
    small = T, weights = T, normalizeWeights = T)
  # Build crosswalk between cell IDs and cell coordinates
  xwalk <- as.data.frame(sample_raster, xy = T) %>% data.table()
  setnames(xwalk, "Daily.air.temperature.at.Surface", "id")
  # Save the cell weights and xwalk
  saveRDS(object = cell_weights,
    file = paste0(dir_r, "DegreeDaysNARR/cellWeights.rds"))
  saveRDS(object = xwalk,
    file = paste0(dir_r, "DegreeDaysNARR/cellXwalk.rds"))


# Calculate county-level daily summaries (for each year's file) ----------------
  # Load the cell weights and the cell xwalk
  cell_weights <- readRDS(
    file = paste0(dir_r, "DegreeDaysNARR/cellWeights.rds"))
  xwalk <- readRDS(
    file = paste0(dir_r, "DegreeDaysNARR/cellXwalk.rds"))
  # Loop over the years (1979 to 2015)
  for(the_year in 1979:2015) {
    # Print the year and start the timer
    print(the_year)
    start_time <- proc.time()
    # Load the given year's temperature data
    tmp_year  <- brick(paste0(dir_temp, "air.sfc.", the_year, ".nc"))
    # Crop the temperature brick to the extent of buffered US shapefile
    tmp_year  <- crop(tmp_year, extent(us_buff))
    # Convert temperatures to F (degree days are in F)
    tmp_year <- tmp_year * 9 / 5 - 459.67
    # Calculate heating and cooling degree days (based upon 65F cutoff)
    hdd_year <- (tmp_year < 65) * (65 - tmp_year)
    cdd_year <- (tmp_year > 65) * (tmp_year - 65)
    # Convert the rasters to data.tables
    tmp_dt <- tmp_year %>% as.data.frame(xy = T) %>% data.table()
    hdd_dt <- hdd_year %>% as.data.frame(xy = T) %>% data.table()
    cdd_dt <- cdd_year %>% as.data.frame(xy = T) %>% data.table()
    # For each county i
    #   1. Convert county i's weights to data.table
    #   2. Merge the weights with the crosswalk (by ID) to add coordinates
    #   3. Merge the cell weights onto the weather data (by coordinates)
    #   4. Calculate weighted means
    #   5. Join weather data to a single object
    year_dt <- mclapply(X = seq_along(cell_weights), FUN = function(i) {
      # Convert the county's weights to a data.table
      wts_i <- cell_weights[[i]] %>% data.table()
      setnames(wts_i, c("value", "weight"), c("id", "wt"))
      # Add coordinates to the weights (merge with xwalk by cell ID)
      wts_i %<>% merge(y = xwalk, by = "id", all.x = T, all.y = F)
      # Merge county i's cell weights with the weather data by coordinates
      tmp_i <- merge(x = wts_i, y = tmp_dt, by = c("x", "y"),
        all.x = T, all.y = F)
      hdd_i <- merge(x = wts_i, y = hdd_dt, by = c("x", "y"),
        all.x = T, all.y = F)
      cdd_i <- merge(x = wts_i, y = cdd_dt, by = c("x", "y"),
        all.x = T, all.y = F)
      # Drop coordinates, weights, and cell IDs
      tmp_i[, c("x", "y", "id") := NULL]
      hdd_i[, c("x", "y", "id") := NULL]
      cdd_i[, c("x", "y", "id") := NULL]
      # For each non-weight column (the weather columns), update the column
      # by multiplying the current column by the cell's weight (given by "wt")
      for(j in which(names(tmp_i) != "wt")) {
        set(x = tmp_i, j = j, value = tmp_i[[j]] * tmp_i[["wt"]])
      }
      for(j in which(names(hdd_i) != "wt")) {
        set(x = hdd_i, j = j, value = hdd_i[[j]] * hdd_i[["wt"]])
      }
      for(j in which(names(cdd_i) != "wt")) {
        set(x = cdd_i, j = j, value = cdd_i[[j]] * cdd_i[["wt"]])
      }
      # Drop the weight columns
      tmp_i[, wt := NULL]
      hdd_i[, wt := NULL]
      cdd_i[, wt := NULL]
      # Sum by column (finishes the area-weighted mean)
      tmp_i <- data.table(
        date = names(tmp_i),
        temp = colSums(tmp_i) %>% unlist())
      hdd_i <- data.table(
        date = names(hdd_i),
        hdd = colSums(hdd_i) %>% unlist())
      cdd_i <- data.table(
        date = names(cdd_i),
        cdd = colSums(cdd_i) %>% unlist())
      # Fix dates (convert to date and round to nearest day)
      tmp_i[, date := date %>% gsub("^X", "", .) %>%
        ymd_hms %>% round_date(unit = "day")]
      hdd_i[, date := date %>% gsub("^X", "", .) %>%
        ymd_hms %>% round_date(unit = "day")]
      cdd_i[, date := date %>% gsub("^X", "", .) %>%
        ymd_hms %>% round_date(unit = "day")]
      # Merge by date
      final_i <- merge(x = tmp_i, y = hdd_i, by = "date", all = T) %>%
        merge(y = cdd_i, by = "date", all = T)
      # Add FIPS
      final_i[, fips := paste0(us_shp$STATE[i], us_shp$COUNTY[i])]
      # Return final_i
      return(final_i)
      }, mc.cores = 4) %>% rbindlist()
    # Save the annual file
    saveRDS(object = year_dt,
      file = paste0(dir_r, "DegreeDaysNARR/countyWeather", the_year, ".rds"))
    # Print the time taken
    print(timetaken(start_time))
  }


# Check work -------------------------------------------------------------------
  # Load US county shapefile
  us_shp <- readOGR(dsn = paste0(dir_spatial, "gz_2010_us_050_00_500k"),
    layer = "gz_2010_us_050_00_500k")
  # Use the state.fips dataset from the maps package to find continental
  # states' FIPS codes
  state_fips <- data.table(state.fips)[, .(fips, abb)] %>% unique()
  state_fips[, fips := str_pad(fips, 2, "left", 0)]
  # Drop Alaska, Hawaii, and Puerto Rico
  us_shp <- subset(us_shp, STATE %in% state_fips$fips)
  # Load 2015 weather data
  dd_dt <- readRDS(paste0(dir_r, "DegreeDaysNARR/countyWeather2015.rds"))
  dd_dt[, date := as.Date(date)]
  # Add FIPS to counties shapefile
  us_shp$fips <- paste0(us_shp$STATE, us_shp$COUNTY)
  # Prepare data for plot
  us_dt <- tidy(us_shp, region = "fips") %>% data.table()
  # Add 2015-05-01 weather to the shapefile
  us_dt <- merge(x = us_dt, y = dd_dt[date == "2015-05-01"],
    by.x = "id", by.y = "fips", all.x = T, all.y = F)
  # Map heating degree days
  hdd_plot <- ggplot(data = us_dt,
    aes(x = long, y = lat, group = group)) +
    geom_polygon(aes(fill = hdd)) +
    coord_map("lambert", lat0 = 50, lat1 = 50) +
    scale_fill_gradientn("Heating degree days (count)",
      colors = colorRamps::matlab.like(100)) +
    theme_map() +
    theme(
      legend.position = "bottom",
      legend.direction = "horizontal",
      legend.key.width = unit(1, "in"))
  ggsave(filename = "plotHDD.jpg", plot = hdd_plot, path = dir_output,
    width = 10, height = 8)
  # Map cooling degree days
  cdd_plot <- ggplot(data = us_dt,
    aes(x = long, y = lat, group = group)) +
    geom_polygon(aes(fill = cdd)) +
    coord_map("lambert", lat0 = 50, lat1 = 50) +
    scale_fill_gradientn("Cooling degree days (count)",
      colors = colorRamps::matlab.like(100)) +
    theme_map() +
    theme(
      legend.position = "bottom",
      legend.direction = "horizontal",
      legend.key.width = unit(1, "in"))
  ggsave(filename = "plotCDD.jpg", plot = cdd_plot, path = dir_output,
    width = 10, height = 8)
  # Map average temperature
  tmp_plot <- ggplot(data = us_dt,
    aes(x = long, y = lat, group = group)) +
    geom_polygon(aes(fill = temp)) +
    coord_map("lambert", lat0 = 50, lat1 = 50) +
    scale_fill_gradientn("Temperature (F)",
      colors = colorRamps::matlab.like(100)) +
    theme_map() +
    theme(
      legend.position = "bottom",
      legend.direction = "horizontal",
      legend.key.width = unit(1, "in"))
  ggsave(filename = "plotTmp.jpg", plot = tmp_plot, path = dir_output,
    width = 10, height = 8)


# Load and prepare population data ---------------------------------------------
  # Load the county population file
  pop_dt <- fread(paste0(dir_csv, "populationCountyCensus2010.csv"))
  # Keep desired variables
  pop_dt <- pop_dt[, .(STATE, COUNTY, STNAME, CENSUS2010POP)]
  # Change names
  setnames(pop_dt, c("state", "county", "state_name", "pop"))
  # Drop observations where county = 0 (state totals)
  pop_dt <- pop_dt[county != 0]
  # Add 5-digit FIPS
  pop_dt[, fips := paste0(str_pad(state, 2, "left", 0),
    str_pad(county, 3, "left", 0))]
  # List of states east of the Mississippi River
  states_east <- c("Alabama", "Connecticut", "Delaware", "Florida", "Georgia",
   "Illinois", "Indiana", "Kentucky", "Maine", "Maryland", "Massachusetts",
   "Michigan", "Mississippi", "New Hampshire", "New Jersey", "New York",
   "North Carolina", "Ohio", "Pennsylvania", "Rhode Island", "South Carolina",
   "Tennessee", "Vermont", "Virginia", "West Virginia", "Wisconsin")
  # Add flag for states that are east of the Mississippi River
  pop_dt[, east := state_name %in% states_east]
  # Calculate region-based weights (for CA and "East")
  pop_ca <- pop_dt[state_name == "California"]$pop %>% sum(na.rm = T)
  pop_east <- pop_dt[east == T]$pop %>% sum(na.rm = T)
  pop_dt[state_name == "California", ca_wt := pop / pop_ca]
  pop_dt[east == T, east_wt := pop / pop_east]


# Construct CA and East regional daily weather ---------------------------------
# NOTE: If I ever want to use this longer time period, I should use estimated
# populations for each year (rather than 2010 populations).
  # Loop over the years
  regional_dt <- lapply(X = 1979:2015, FUN = function(the_year) {
    # Open the year's weather panel
    w_dt <- readRDS(paste0(dir_r, "DegreeDaysNARR/countyWeather",
      the_year, ".rds"))
    # Join the population data to the weather data
    w_dt %<>% merge(y = pop_dt, by = "fips", all.x = T, all.y  = F)
    # Split data into CA and East datasets; drop unwanted variables
    east_dt <- w_dt[east == T][,
      c("state", "county", "state_name", "east", "ca_wt") := NULL]
    ca_dt <- w_dt[state_name == "California"][,
      c("state", "county", "state_name", "east", "east_wt") := NULL]
    setnames(east_dt, "east_wt", "wt")
    setnames(ca_dt, "ca_wt", "wt")
    # Calculate daily summaries (weighting by population)
    east_dt <- east_dt[, list(
      avg_hdd = sum(hdd * wt),
      avg_cdd = sum(cdd * wt),
      avg_tmp = sum(temp * wt),
      region  = "east"
      ), by = date]
    ca_dt <- ca_dt[, list(
      avg_hdd = sum(hdd * wt),
      avg_cdd = sum(cdd * wt),
      avg_tmp = sum(temp * wt),
      region  = "ca"
      ), by = date]
    # Stack the data and return the data
    stack_dt <- rbindlist(list(east_dt, ca_dt))
    return(stack_dt)
    }) %>% rbindlist()


# Create lags ------------------------------------------------------------------
  # For each region and variable:
  #   0. Take region-based subset and ensure order
  #   1. Take lags 1-366
  #   2. Sum the lags for the past 31, 61, 92, 122, 153, 183, 214, 244,
  #      274, 305, 335, and 366 days (to get months)
  days <- c(0, 31, 61, 92, 122, 153, 183, 214, 244, 274, 305, 335, 366)
  # Create region-based subsets
  sub_ca   <- regional_dt[region == "ca"]
  sub_east <- regional_dt[region == "east"]
  # Ensure order by date
  setorder(sub_ca, date)
  setorder(sub_east, date)
  # Copy datasets
  tmp_ca <- sub_ca[, .(date, avg_tmp)]
  hdd_ca <- sub_ca[, .(date, avg_hdd)]
  cdd_ca <- sub_ca[, .(date, avg_cdd)]
  tmp_east <- sub_east[, .(date, avg_tmp)]
  hdd_east <- sub_east[, .(date, avg_hdd)]
  cdd_east <- sub_east[, .(date, avg_cdd)]
  # Calculate lags up to 366 days
  tmp_ca[, paste0("tmp", 1:366) :=
    data.table::shift(avg_tmp, 1L:366L, NA, "lag")]
  hdd_ca[, paste0("hdd", 1:366) :=
    data.table::shift(avg_hdd, 1L:366L, NA, "lag")]
  cdd_ca[, paste0("cdd", 1:366) :=
    data.table::shift(avg_cdd, 1L:366L, NA, "lag")]
  tmp_east[, paste0("tmp", 1:366) :=
    data.table::shift(avg_tmp, 1L:366L, NA, "lag")]
  hdd_east[, paste0("hdd", 1:366) :=
    data.table::shift(avg_hdd, 1L:366L, NA, "lag")]
  cdd_east[, paste0("cdd", 1:366) :=
    data.table::shift(avg_cdd, 1L:366L, NA, "lag")]
  # Loop over months to calculate monthly totals, adding as new variable
  for(i in 1:12) {
    n <- str_pad(i, 2, "left", 0)
    # Adding new variables to CA dataset
    set(x = sub_ca, j = paste0("avg_tmp_lag_month", n), value =
      (tmp_ca[, c(paste0("tmp", (days[i]+1):days[i+1])), with = F] %>%
        rowSums(na.rm = T)) / length((days[i]+1):days[i+1]))
    set(x = sub_ca, j = paste0("tot_hdd_lag_month", n), value =
      hdd_ca[, c(paste0("hdd", (days[i]+1):days[i+1])), with = F] %>%
      rowSums(na.rm = T))
    set(x = sub_ca, j = paste0("tot_cdd_lag_month", n), value =
      cdd_ca[, c(paste0("cdd", (days[i]+1):days[i+1])), with = F] %>%
      rowSums(na.rm = T))
    # Adding new variables to East dataset
    set(x = sub_east, j = paste0("avg_tmp_lag_month", n), value =
      (tmp_east[, c(paste0("tmp", (days[i]+1):days[i+1])), with = F] %>%
        rowSums(na.rm = T)) / length((days[i]+1):days[i+1]))
    set(x = sub_east, j = paste0("tot_hdd_lag_month", n), value =
      hdd_east[, c(paste0("hdd", (days[i]+1):days[i+1])), with = F] %>%
      rowSums(na.rm = T))
    set(x = sub_east, j = paste0("tot_cdd_lag_month", n), value =
      cdd_east[, c(paste0("cdd", (days[i]+1):days[i+1])), with = F] %>%
      rowSums(na.rm = T))
  }
  # Bind regions' data together again
  stack_dt <- rbindlist(list(sub_ca, sub_east), use.names = T, fill = T)
  # Drop 1979 and 1980 due to missing lags
  stack_dt <- stack_dt[year(date) > 1980]
  # Re-order columns
  tmp_names <- names(stack_dt)[!(names(stack_dt) %in% c("date", "region"))]
  setcolorder(stack_dt, c("date", "region", tmp_names[order(tmp_names)]))
  # Save dataset
  saveRDS(object = stack_dt, file = paste0(dir_r, "degreeDayNARR.rds"))
